import os
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings("ignore")
# Load the Spotify datasets: per-track data, per-genre aggregates, per-year aggregates.
# The directory is factored out so the location only has to change in one place.
# NOTE(review): absolute Windows path — consider making this relative/configurable.
DATASET_DIR = r'C:\Users\princ\Desktop\School PDFs\Практика\music_recommend\dataset'
data = pd.read_csv(os.path.join(DATASET_DIR, 'data.csv'))
genre_data = pd.read_csv(os.path.join(DATASET_DIR, 'data_by_genres.csv'))
year_data = pd.read_csv(os.path.join(DATASET_DIR, 'data_by_year.csv'))
# Quick schema overview of the per-track dataset (columns, dtypes, null counts).
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 170653 entries, 0 to 170652 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 valence 170653 non-null float64 1 year 170653 non-null int64 2 acousticness 170653 non-null float64 3 artists 170653 non-null object 4 danceability 170653 non-null float64 5 duration_ms 170653 non-null int64 6 energy 170653 non-null float64 7 explicit 170653 non-null int64 8 id 170653 non-null object 9 instrumentalness 170653 non-null float64 10 key 170653 non-null int64 11 liveness 170653 non-null float64 12 loudness 170653 non-null float64 13 mode 170653 non-null int64 14 name 170653 non-null object 15 popularity 170653 non-null int64 16 release_date 170653 non-null object 17 speechiness 170653 non-null float64 18 tempo 170653 non-null float64 dtypes: float64(9), int64(6), object(4) memory usage: 24.7+ MB
# Quick schema overview of the per-genre aggregate dataset.
genre_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2973 entries, 0 to 2972 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mode 2973 non-null int64 1 genres 2973 non-null object 2 acousticness 2973 non-null float64 3 danceability 2973 non-null float64 4 duration_ms 2973 non-null float64 5 energy 2973 non-null float64 6 instrumentalness 2973 non-null float64 7 liveness 2973 non-null float64 8 loudness 2973 non-null float64 9 speechiness 2973 non-null float64 10 tempo 2973 non-null float64 11 valence 2973 non-null float64 12 popularity 2973 non-null float64 13 key 2973 non-null int64 dtypes: float64(11), int64(2), object(1) memory usage: 325.3+ KB
# Quick schema overview of the per-year aggregate dataset.
year_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mode 100 non-null int64 1 year 100 non-null int64 2 acousticness 100 non-null float64 3 danceability 100 non-null float64 4 duration_ms 100 non-null float64 5 energy 100 non-null float64 6 instrumentalness 100 non-null float64 7 liveness 100 non-null float64 8 loudness 100 non-null float64 9 speechiness 100 non-null float64 10 tempo 100 non-null float64 11 valence 100 non-null float64 12 popularity 100 non-null float64 13 key 100 non-null int64 dtypes: float64(11), int64(3) memory usage: 11.1 KB
!pip install yellowbrick
Requirement already satisfied: yellowbrick in c:\users\princ\anaconda3\lib\site-packages (1.5) Requirement already satisfied: scikit-learn>=1.0.0 in c:\users\princ\anaconda3\lib\site-packages (from yellowbrick) (1.2.1) Requirement already satisfied: scipy>=1.0.0 in c:\users\princ\anaconda3\lib\site-packages (from yellowbrick) (1.10.0) Requirement already satisfied: cycler>=0.10.0 in c:\users\princ\anaconda3\lib\site-packages (from yellowbrick) (0.11.0) Requirement already satisfied: numpy>=1.16.0 in c:\users\princ\anaconda3\lib\site-packages (from yellowbrick) (1.23.5) Requirement already satisfied: matplotlib!=3.0.0,>=2.0.2 in c:\users\princ\anaconda3\lib\site-packages (from yellowbrick) (3.7.0) Requirement already satisfied: python-dateutil>=2.7 in c:\users\princ\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (2.8.2) Requirement already satisfied: packaging>=20.0 in c:\users\princ\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (22.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\princ\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (3.0.9) Requirement already satisfied: contourpy>=1.0.1 in c:\users\princ\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.0.5) Requirement already satisfied: pillow>=6.2.0 in c:\users\princ\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (9.4.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\princ\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\princ\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.4.4) Requirement already satisfied: joblib>=1.1.1 in c:\users\princ\anaconda3\lib\site-packages (from scikit-learn>=1.0.0->yellowbrick) (1.1.1) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\princ\anaconda3\lib\site-packages (from 
scikit-learn>=1.0.0->yellowbrick) (2.2.0) Requirement already satisfied: six>=1.5 in c:\users\princ\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.16.0)
from yellowbrick.target import FeatureCorrelation

# Audio/metadata features whose Pearson correlation with track popularity
# we want to visualize.
feature_names = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
    'duration_ms', 'explicit', 'key', 'mode', 'year',
]
X, y = data[feature_names], data['popularity']

# Feature labels for the visualizer.
features = np.array(feature_names)

plt.rcParams['figure.figsize'] = (20, 20)
visualizer = FeatureCorrelation(labels=features)
visualizer.fit(X, y)  # compute per-feature correlation with the target
visualizer.show()
<Axes: title={'center': 'Features correlation with dependent variable'}, xlabel='Pearson Correlation'>
def get_decade(year):
    """Return the decade label for *year*, e.g. 1985 -> '1980s'.

    Uses integer floor division instead of ``int(year / 10)`` — same result
    for the (non-negative) years in this dataset, but avoids going through
    float division, and an f-string replaces ``str.format``.
    """
    return f'{year // 10 * 10}s'
# Tag every track with its decade label for the distribution plots below.
# Series.map is equivalent to Series.apply for a plain element-wise callable.
data['decade'] = data['year'].map(get_decade)
import matplotlib.pyplot as plt

# Bar chart: number of tracks per decade, using the explicit Axes API
# rather than the pyplot state machine.
counts_by_decade = data['decade'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(11, 6))
counts_by_decade.plot(kind='bar', ax=ax)
ax.set_xlabel('Decade')
ax.set_ylabel('Count')
ax.set_title('Count of Songs by Decade')
plt.show()
import matplotlib.pyplot as plt
import numpy as np

# Bar chart of track counts per decade, one distinct color per bar.
decade_counts = data['decade'].value_counts().sort_index()
# plt.cm.get_cmap(name, lut) is deprecated since matplotlib 3.7 (removed in
# 3.9); plt.get_cmap is the supported spelling with identical behavior.
colors = plt.get_cmap('Set3', len(decade_counts))
plt.figure(figsize=(11, 6))
plt.bar(decade_counts.index, decade_counts.values,
        color=colors(np.arange(len(decade_counts))))
plt.xlabel('Decade')
plt.ylabel('Count')
plt.title('Count of Songs by Decade')
plt.show()
# How the mean audio features evolve across years.
sound_features = [
    'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'valence',
]
fig = px.line(year_data, x='year', y=sound_features)
fig.show()
This dataset contains the audio features for different songs along with the audio features for different genres. We can use this information to compare different genres and understand their unique differences in sound.
# Compare key audio features across the ten most popular genres.
top10_genres = genre_data.nlargest(10, 'popularity')
feature_cols = ['valence', 'energy', 'danceability', 'acousticness']
fig = px.bar(top10_genres, x='genres', y=feature_cols, barmode='group')
fig.show()
Here, the simple K-means clustering algorithm is used to divide the genres in this dataset into ten clusters based on the numerical audio features of each genre.
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Scale the numeric genre features, then group the genres into 10 K-means
# clusters. random_state pins the stochastic centroid initialization so the
# clustering is reproducible across runs; n_init is given explicitly to
# silence scikit-learn's FutureWarning about its changing default.
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10, n_init=10, random_state=42)),
])
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)
# Project the scaled genre features to 2-D with t-SNE and scatter-plot them,
# colored by K-means cluster, with genre names on hover.
from sklearn.manifold import TSNE

tsne_pipeline = Pipeline([('scaler', StandardScaler()),
                          ('tsne', TSNE(n_components=2, verbose=1))])
genre_embedding = tsne_pipeline.fit_transform(X)

projection = pd.DataFrame(genre_embedding, columns=['x', 'y'])
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

fig = px.scatter(projection, x='x', y='y', color='cluster',
                 hover_data=['x', 'y', 'genres'])
fig.show()
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 2973 samples in 0.015s... [t-SNE] Computed neighbors for 2973 samples in 0.333s... [t-SNE] Computed conditional probabilities for sample 1000 / 2973 [t-SNE] Computed conditional probabilities for sample 2000 / 2973 [t-SNE] Computed conditional probabilities for sample 2973 / 2973 [t-SNE] Mean sigma: 0.777516 [t-SNE] KL divergence after 250 iterations with early exaggeration: 76.106194 [t-SNE] KL divergence after 1000 iterations: 1.392006
# Cluster every track on its numeric features (20 clusters — the track data
# is far larger and more varied than the genre data). random_state and an
# explicit n_init are added for reproducibility, matching the genre pipeline.
song_cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=20, n_init=10, random_state=42,
                      verbose=False)),
], verbose=False)

X = data.select_dtypes(np.number)
number_cols = list(X.columns)  # kept for later feature-vector construction
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels
# Reduce the scaled song features to two principal components and plot them,
# colored by cluster label, with song titles on hover.
from sklearn.decomposition import PCA

pca_pipeline = Pipeline([('scaler', StandardScaler()),
                         ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)

projection = pd.DataFrame(song_embedding, columns=['x', 'y'])
projection['title'] = data['name']
projection['cluster'] = data['cluster_label']

fig = px.scatter(projection, x='x', y='y', color='cluster',
                 hover_data=['x', 'y', 'title'])
fig.show()